# -*- coding: utf-8 -*-
"""
Created on Sun Jul 11 14:31:31 2021

@author: DL

Shows the steps to take once sc_md1.pickle has already been created,
i.e., how to get sentence counts using the included .pickle file.

"""

import os
import nltk.tokenize.punkt
import pickle
#import codecs
from nltk import word_tokenize


# Load the tokenizer stored in sc_md1.pickle (presumably a trained Punkt
# sentence tokenizer -- the loop below calls its .tokenize() method and reads
# its ._params.abbrev_types).
# The sc_md1.pickle file must be placed in an nltk_data folder where nltk can
# find it, likely located at C://nltk_data (or similar).
# NOTE(review): this unpickles a file from disk, and unpickling can execute
# arbitrary code -- only use a sc_md1.pickle obtained from a trusted source.
# NOTE(review): recent NLTK releases appear to restrict what nltk.data.load
# will unpickle -- confirm the installed version still loads this file.
s_t=nltk.data.load('sc_md1.pickle')



# Titles and citation abbreviations, written without their trailing period.
# The sentence-counting loop looks the last tokens of a chunk up in this set
# (case-sensitively); when one of them is found, the split that follows the
# chunk is not counted as a sentence boundary.
ne = {
    'Mr', 'Mrs', 'Ms', 'Dr', 'Drs', 'Prof', 'Profs',
    'Sen', 'Rep', 'Rev', 'Pres',
    'Sgt', 'Lt', 'Pfc', 'Col',
    'St', 'Fed', 'Art',
    'v', 'cf', 'nom',
}



# Set this to the location of the folder of preprocessed opinions.
# Preprocessing should remove extraneous punctuation (quotation marks,
# parentheses, stars, etc.) but keep question marks, periods, and
# exclamation marks. It is safer to remove numerals too.
# The loop below reads the files in this folder and appends its results to
# sent_counts.txt inside it.
os.chdir("V:\\docs\\sc_ops\\testfolder_preproc_nn")

# Characters, besides letters, that may open the first token of a sentence.
_OPENING_QUOTES = ("'", "`", '"')


def _starts_new_sentence(cur_tokens, next_tokens, name_abbrevs, abbrev_types):
    """Return True if the split between two adjacent chunks is a sentence break.

    The tokenizer sometimes splits after an abbreviation; this check decides
    whether the chunk ending at the split should be counted as a sentence.

    Args:
        cur_tokens: word tokens of the chunk that ends at the split.
        next_tokens: word tokens of the chunk that follows the split.
        name_abbrevs: set of abbreviations (no trailing period) that must not
            end a sentence; looked up case-sensitively.
        abbrev_types: the tokenizer's own set of abbreviation types.

    Returns:
        bool: True when the split is counted as a real sentence boundary.
    """
    # The following chunk must have more than two tokens.
    if len(next_tokens) <= 2:
        return False

    # A chunk of three or more tokens whose second-to-last token, or whose
    # last token minus its final character, is a listed abbreviation was
    # split mid-sentence.
    if len(cur_tokens) >= 3 and (
        cur_tokens[-2] in name_abbrevs or cur_tokens[-1][:-1] in name_abbrevs
    ):
        return False

    # The following chunk must not start with an all-lowercase token, and its
    # first character must be a letter or an opening quote mark.
    first = next_tokens[0]
    if first.islower():
        return False
    if not (first[0].isalpha() or first[0] in _OPENING_QUOTES):
        return False

    # Second-to-last token of the chunk, or its only token when it has one.
    near_end = cur_tokens[max(-len(cur_tokens), -2)]
    # NOTE(review): near_end is lower-cased before the lookup but the trimmed
    # last token is not -- confirm whether that asymmetry is intended.
    if (near_end.lower() not in abbrev_types
            and cur_tokens[-1][:-1] not in abbrev_types):
        return True

    # The chunk ends in a tokenizer abbreviation: still count it unless the
    # following chunk itself opens with an abbreviation-like token (a token
    # that ends in "." or is immediately followed by a "." token).
    return next_tokens[1] != "." and first[-1] != "."


def _count_sentences(text, tokenizer, name_abbrevs):
    """Return the number of sentences in *text*.

    Splits *text* with ``tokenizer.tokenize`` and counts every split accepted
    by ``_starts_new_sentence``, plus one for the final chunk. Text that
    yields no chunks counts as 0.
    """
    # Word-tokenize every chunk exactly once; each chunk is needed both as
    # the "current" and as the "next" member of a pair.
    tokenized = [word_tokenize(chunk) for chunk in tokenizer.tokenize(text)]
    if not tokenized:
        return 0

    abbrev_types = tokenizer._params.abbrev_types
    confirmed = sum(
        1
        for cur_tokens, next_tokens in zip(tokenized, tokenized[1:])
        if _starts_new_sentence(cur_tokens, next_tokens, name_abbrevs, abbrev_types)
    )
    # The last chunk always counts as a sentence.
    return confirmed + 1


# Count the sentences of every file in the current folder and append one
# "<name>,<count>" row per file to sent_counts.txt.
for f in os.listdir('.'):
    # Skip the results file itself (present when the script is re-run) and
    # anything that is not a regular file, such as a sub-directory.
    if f == "sent_counts.txt" or not os.path.isfile(f):
        continue

    # Read with the platform default encoding; the context manager makes sure
    # the handle is closed.
    with open(f) as src:
        uf = src.read()

    runct = _count_sentences(uf, s_t, ne)

    # Append right away so rows already written survive a later failure.
    # f[:-4] drops the last four characters of the name (assumes an extension
    # such as ".txt").
    with open("sent_counts.txt", "a") as df:
        df.write(str(f[:-4])+","+str(runct)+"\n")

